In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

Load Data¶

In [2]:
# Both competition splits live in the same folder; keep that location in one
# place so it only has to be edited once.
DATA_DIR = "../dataset/spaceship-titanic"
train_raw_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_raw_df = pd.read_csv(f"{DATA_DIR}/test.csv")
In [3]:
# Work on an independent (deep) copy so the raw training frame stays untouched.
train_process_df = train_raw_df.copy(deep=True)

Data Exploration¶

In [4]:
def explore_data(column, info=True, chart=None, chart_column="Transported", df=None):
    """Print a text summary of one column and optionally plot it.

    Parameters
    ----------
    column : str
        Name of the column to inspect.
    info : bool, default True
        When true, print the column's ``info()``, value counts, null count
        and ``describe()`` output, and return the column.
    chart : iterable of str, optional
        Any of ``"bar"``, ``"pie"``, ``"hist"``, ``"box"``. Each requested
        chart is rendered with plotly express, always in that order.
        ``None`` (the default) draws nothing.
    chart_column : str, default "Transported"
        Column used to colour the bars of the bar chart.
    df : pandas.DataFrame, optional
        Frame to explore. Defaults to the notebook-level ``train_raw_df``.

    Returns
    -------
    pandas.Series or None
        ``df[column]`` when ``info`` is true, otherwise ``None``.
    """
    if df is None:
        # Fall back to the notebook-level raw training frame.
        df = train_raw_df
    if chart is None:
        # None instead of a shared mutable ``[]`` default.
        chart = ()
    column_data = df[column]

    if info:
        print("------- Column Info: -------")
        # Series.info() prints its own report and returns None, so wrapping
        # it in print() emitted a stray "None" line after the report.
        column_data.info()

        print("------- Data Counts: -------")
        print(column_data.value_counts())

        print("------- Null Check: -------")
        print(column_data.isnull().sum())

        print("------- Describe -------")
        print(column_data.describe())

    if "bar" in chart:
        print(f"------- Bar Plot {column} vs {chart_column}: -------")
        fig = px.bar(df, x=column, color=chart_column, barmode="group",
                     color_discrete_sequence=px.colors.qualitative.Antique)
        fig.update_traces(dict(marker_line_width=0))
        fig.show()

    if "pie" in chart:
        print(f"------- Pie Chart {column} Count plot: -------")
        # dropna=False keeps missing values as their own slice.
        counts = column_data.value_counts(dropna=False)
        fig = px.pie(values=counts.tolist(), names=counts.index.tolist(),
                     color_discrete_sequence=px.colors.qualitative.Antique)
        fig.update_traces(dict(marker_line_width=0))
        fig.show()

    if "hist" in chart:
        print(f"------- Hist Chart {column} : -------")
        fig = px.histogram(x=column_data, barmode="group",
                           color_discrete_sequence=px.colors.qualitative.Antique)
        fig.update_layout(bargap=0.1)
        fig.update_traces(dict(marker_line_width=0))
        fig.show()

    if "box" in chart:
        # Header added so every chart type announces itself the same way.
        print(f"------- Box Plot {column} : -------")
        fig = px.box(df, y=column, color_discrete_sequence=px.colors.qualitative.Antique)
        fig.update_traces(dict(marker_line_width=0))
        fig.show()

    if info:
        return column_data
    return None
In [5]:
def change_data(column, type_cast=None, df=None):
    """Cast one column of the working frame to a new dtype, in place.

    Parameters
    ----------
    column : str
        Name of the column to convert.
    type_cast : dict, optional
        Mapping whose ``"to"`` entry is the target dtype passed to
        ``Series.astype``. When empty or ``None`` the call is a no-op.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``train_process_df``.

    Raises
    ------
    KeyError
        If ``type_cast`` is non-empty but has no ``"to"`` key.
    """
    if not type_cast:
        return
    if "to" not in type_cast:
        raise KeyError('type_cast must contain a "to" key naming the target dtype')
    if df is None:
        # Fall back to the notebook-level working copy.
        df = train_process_df
    df[column] = df[column].astype(type_cast["to"])

Data-Column Exploration¶

In [6]:
# Peek at the raw test split; pandas abbreviates the display to the leading and trailing rows.
test_raw_df
Out[6]:
PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Name
0 0013_01 Earth True G/3/S TRAPPIST-1e 27.0 False 0.0 0.0 0.0 0.0 0.0 Nelly Carsoning
1 0018_01 Earth False F/4/S TRAPPIST-1e 19.0 False 0.0 9.0 0.0 2823.0 0.0 Lerome Peckers
2 0019_01 Europa True C/0/S 55 Cancri e 31.0 False 0.0 0.0 0.0 0.0 0.0 Sabih Unhearfus
3 0021_01 Europa False C/1/S TRAPPIST-1e 38.0 False 0.0 6652.0 0.0 181.0 585.0 Meratz Caltilter
4 0023_01 Earth False F/5/S TRAPPIST-1e 20.0 False 10.0 0.0 635.0 0.0 0.0 Brence Harperez
... ... ... ... ... ... ... ... ... ... ... ... ... ...
4272 9266_02 Earth True G/1496/S TRAPPIST-1e 34.0 False 0.0 0.0 0.0 0.0 0.0 Jeron Peter
4273 9269_01 Earth False NaN TRAPPIST-1e 42.0 False 0.0 847.0 17.0 10.0 144.0 Matty Scheron
4274 9271_01 Mars True D/296/P 55 Cancri e NaN False 0.0 0.0 0.0 0.0 0.0 Jayrin Pore
4275 9273_01 Europa False D/297/P NaN NaN False 0.0 2680.0 0.0 0.0 523.0 Kitakan Conale
4276 9277_01 Earth True G/1498/S PSO J318.5-22 43.0 False 0.0 0.0 0.0 0.0 0.0 Lilace Leonzaley

4277 rows × 13 columns

In [7]:
# Column dtypes and non-null counts for the raw training split.
train_raw_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
In [8]:
# First five rows of the raw training split.
train_raw_df.head(n=5)
Out[8]:
PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Name Transported
0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy False
1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False 109.0 9.0 25.0 549.0 44.0 Juanna Vines True
2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True 43.0 3576.0 0.0 6715.0 49.0 Altark Susent False
3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False 0.0 1283.0 371.0 3329.0 193.0 Solam Susent False
4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False 303.0 70.0 151.0 565.0 2.0 Willy Santantines True
In [9]:
# Class balance of the Transported column.
train_raw_df["Transported"].value_counts()
Out[9]:
True     4378
False    4315
Name: Transported, dtype: int64

PassengerId¶

In [10]:
# PassengerId is a distinct string on every row (count == unique in describe()),
# so only the text summary is requested; a box plot of it carries no information.
explore_data("PassengerId")
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: PassengerId
Non-Null Count  Dtype 
--------------  ----- 
8693 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
0001_01    1
6136_01    1
6141_01    1
6139_06    1
6139_05    1
          ..
3126_01    1
3124_03    1
3124_02    1
3124_01    1
9280_02    1
Name: PassengerId, Length: 8693, dtype: int64
------- Null Check: -------
0
------- Describe -------
count        8693
unique       8693
top       0001_01
freq            1
Name: PassengerId, dtype: object
Out[10]:
0       0001_01
1       0002_01
2       0003_01
3       0003_02
4       0004_01
         ...   
8688    9276_01
8689    9278_01
8690    9279_01
8691    9280_01
8692    9280_02
Name: PassengerId, Length: 8693, dtype: object
In [11]:
# Fixed seed so the sampled ids are the same on every Restart & Run All.
train_raw_df["PassengerId"].sample(10, random_state=42)
Out[11]:
1632    1728_01
1753    1865_02
8450    9026_01
7673    8190_01
4375    4656_01
2516    2703_01
7563    8084_01
5910    6268_01
3808    4066_02
3796    4050_01
Name: PassengerId, dtype: object
In [12]:
# Split ids such as "0003_02" on the underscore into two string columns.
train_raw_df["PassengerId"].str.split("_", expand=True)
Out[12]:
0 1
0 0001 01
1 0002 01
2 0003 01
3 0003 02
4 0004 01
... ... ...
8688 9276 01
8689 9278 01
8690 9279 01
8691 9280 01
8692 9280 02

8693 rows × 2 columns

In [13]:
# Distinct values of the part before the underscore
# (looks like a shared group id, e.g. 0003_01 / 0003_02 — TODO confirm).
train_raw_df["PassengerId"].str.split("_", expand=True)[0].unique()
Out[13]:
array(['0001', '0002', '0003', ..., '9278', '9279', '9280'], dtype=object)
In [14]:
# Number of distinct id prefixes; dropna=False counts exactly what len(unique()) counts.
train_raw_df["PassengerId"].str.split("_", expand=True)[0].nunique(dropna=False)
Out[14]:
6217

HomePlanet¶

In [15]:
# HomePlanet: text summary, a bar chart coloured by Transported, and a pie of value counts (missing values included).
explore_data(column="HomePlanet", chart=["pie", "bar"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: HomePlanet
Non-Null Count  Dtype 
--------------  ----- 
8492 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64
------- Null Check: -------
201
------- Describe -------
count      8492
unique        3
top       Earth
freq       4602
Name: HomePlanet, dtype: object
------- Bar Plot HomePlanet vs Transported: -------
------- Pie Chart HomePlanet Count plot: -------
Out[15]:
0       Europa
1        Earth
2       Europa
3       Europa
4        Earth
         ...  
8688    Europa
8689     Earth
8690     Earth
8691    Europa
8692    Europa
Name: HomePlanet, Length: 8693, dtype: object

CryoSleep¶

In [16]:
# CryoSleep: text summary only.
explore_data(column="CryoSleep")
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: CryoSleep
Non-Null Count  Dtype 
--------------  ----- 
8476 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
False    5439
True     3037
Name: CryoSleep, dtype: int64
------- Null Check: -------
217
------- Describe -------
count      8476
unique        2
top       False
freq       5439
Name: CryoSleep, dtype: object
Out[16]:
0       False
1       False
2       False
3       False
4       False
        ...  
8688    False
8689     True
8690    False
8691    False
8692    False
Name: CryoSleep, Length: 8693, dtype: object
In [17]:
# CryoSleep bar chart coloured by the default column (Transported), without the text summary.
explore_data(column="CryoSleep", info=False, chart=["bar"])
------- Bar Plot CryoSleep vs Transported: -------
In [18]:
# Same CryoSleep bar chart, this time coloured by VIP.
explore_data(column="CryoSleep", info=False, chart=["bar"], chart_column="VIP")
------- Bar Plot CryoSleep vs VIP: -------

Cabin¶

In [19]:
# Cabin: text summary only.
explore_data(column="Cabin")
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
8494 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64
------- Null Check: -------
199
------- Describe -------
count        8494
unique       6560
top       G/734/S
freq            8
Name: Cabin, dtype: object
Out[19]:
0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object
In [20]:
# Cabin values look like "B/0/P"; split once on "/" and reuse the result
# instead of repeating the same split for each derived column.
cabin_parts = train_raw_df["Cabin"].str.split("/", expand=True)
train_process_df["deck"] = cabin_parts[0]
train_process_df["Num"] = cabin_parts[1]
train_process_df["Side"] = cabin_parts[2]
In [21]:
# Inspect the working copy after adding the deck / Num / Side columns.
train_process_df
Out[21]:
PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Name Transported deck Num Side
0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy False B 0 P
1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False 109.0 9.0 25.0 549.0 44.0 Juanna Vines True F 0 S
2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True 43.0 3576.0 0.0 6715.0 49.0 Altark Susent False A 0 S
3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False 0.0 1283.0 371.0 3329.0 193.0 Solam Susent False A 0 S
4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False 303.0 70.0 151.0 565.0 2.0 Willy Santantines True F 1 S
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8688 9276_01 Europa False A/98/P 55 Cancri e 41.0 True 0.0 6819.0 0.0 1643.0 74.0 Gravior Noxnuther False A 98 P
8689 9278_01 Earth True G/1499/S PSO J318.5-22 18.0 False 0.0 0.0 0.0 0.0 0.0 Kurta Mondalley False G 1499 S
8690 9279_01 Earth False G/1500/S TRAPPIST-1e 26.0 False 0.0 0.0 1872.0 1.0 0.0 Fayey Connon True G 1500 S
8691 9280_01 Europa False E/608/S 55 Cancri e 32.0 False 0.0 1049.0 0.0 353.0 3235.0 Celeon Hontichre False E 608 S
8692 9280_02 Europa False E/608/S TRAPPIST-1e 44.0 False 126.0 4688.0 0.0 0.0 12.0 Propsh Hontichre True E 608 S

8693 rows × 17 columns

In [22]:
# Cast Num to pandas' nullable Int64 through the notebook's change_data helper,
# which performs the same in-place astype on train_process_df.
change_data("Num", type_cast={"to": "Int64"})
In [23]:
# Re-check dtypes: Num should now be reported as Int64.
train_process_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  deck          8494 non-null   object 
 15  Num           8494 non-null   Int64  
 16  Side          8494 non-null   object 
dtypes: Int64(1), bool(1), float64(6), object(9)
memory usage: 1.1+ MB

Destination¶

In [24]:
# Destination: text summary, a bar chart coloured by Transported, and a pie of value counts (missing values included).
explore_data(column="Destination", chart=["pie", "bar"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Destination
Non-Null Count  Dtype 
--------------  ----- 
8511 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64
------- Null Check: -------
182
------- Describe -------
count            8511
unique              3
top       TRAPPIST-1e
freq             5915
Name: Destination, dtype: object
------- Bar Plot Destination vs Transported: -------
------- Pie Chart Destination Count plot: -------
Out[24]:
0         TRAPPIST-1e
1         TRAPPIST-1e
2         TRAPPIST-1e
3         TRAPPIST-1e
4         TRAPPIST-1e
            ...      
8688      55 Cancri e
8689    PSO J318.5-22
8690      TRAPPIST-1e
8691      55 Cancri e
8692      TRAPPIST-1e
Name: Destination, Length: 8693, dtype: object

Age¶

In [25]:
# Age: text summary plus a histogram and a box plot.
explore_data(column="Age", chart=["hist", "box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Age
Non-Null Count  Dtype  
--------------  -----  
8514 non-null   float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
24.0    324
18.0    320
21.0    311
19.0    293
23.0    292
       ... 
72.0      4
78.0      3
79.0      3
76.0      2
77.0      2
Name: Age, Length: 80, dtype: int64
------- Null Check: -------
179
------- Describe -------
count    8514.000000
mean       28.827930
std        14.489021
min         0.000000
25%        19.000000
50%        27.000000
75%        38.000000
max        79.000000
Name: Age, dtype: float64
------- Hist Chart Age : -------
Out[25]:
0       39.0
1       24.0
2       58.0
3       33.0
4       16.0
        ... 
8688    41.0
8689    18.0
8690    26.0
8691    32.0
8692    44.0
Name: Age, Length: 8693, dtype: float64

VIP¶

In [26]:
# VIP: text summary, a bar chart coloured by Transported, and a pie of value counts (missing values included).
explore_data(column="VIP", chart=["bar", "pie"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: VIP
Non-Null Count  Dtype 
--------------  ----- 
8490 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
False    8291
True      199
Name: VIP, dtype: int64
------- Null Check: -------
203
------- Describe -------
count      8490
unique        2
top       False
freq       8291
Name: VIP, dtype: object
------- Bar Plot VIP vs Transported: -------
------- Pie Chart VIP Count plot: -------
Out[26]:
0       False
1       False
2        True
3       False
4       False
        ...  
8688     True
8689    False
8690    False
8691    False
8692    False
Name: VIP, Length: 8693, dtype: object

RoomService¶

In [27]:
# RoomService: text summary plus a histogram and a box plot.
explore_data(column="RoomService", chart=["box", "hist"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: RoomService
Non-Null Count  Dtype  
--------------  -----  
8512 non-null   float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0       5577
1.0        117
2.0         79
3.0         61
4.0         47
          ... 
1612.0       1
2598.0       1
632.0        1
378.0        1
745.0        1
Name: RoomService, Length: 1273, dtype: int64
------- Null Check: -------
181
------- Describe -------
count     8512.000000
mean       224.687617
std        666.717663
min          0.000000
25%          0.000000
50%          0.000000
75%         47.000000
max      14327.000000
Name: RoomService, dtype: float64
------- Hist Chart RoomService : -------
Out[27]:
0         0.0
1       109.0
2        43.0
3         0.0
4       303.0
        ...  
8688      0.0
8689      0.0
8690      0.0
8691      0.0
8692    126.0
Name: RoomService, Length: 8693, dtype: float64

FoodCourt¶

In [28]:
# FoodCourt: text summary plus a box plot.
explore_data(column="FoodCourt", chart=["box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: FoodCourt
Non-Null Count  Dtype  
--------------  -----  
8510 non-null   float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0       5456
1.0        116
2.0         75
3.0         53
4.0         53
          ... 
3846.0       1
5193.0       1
312.0        1
827.0        1
4688.0       1
Name: FoodCourt, Length: 1507, dtype: int64
------- Null Check: -------
183
------- Describe -------
count     8510.000000
mean       458.077203
std       1611.489240
min          0.000000
25%          0.000000
50%          0.000000
75%         76.000000
max      29813.000000
Name: FoodCourt, dtype: float64
Out[28]:
0          0.0
1          9.0
2       3576.0
3       1283.0
4         70.0
         ...  
8688    6819.0
8689       0.0
8690       0.0
8691    1049.0
8692    4688.0
Name: FoodCourt, Length: 8693, dtype: float64

ShoppingMall¶

In [29]:
# ShoppingMall: text summary plus a box plot.
explore_data(column="ShoppingMall", chart=["box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: ShoppingMall
Non-Null Count  Dtype  
--------------  -----  
8485 non-null   float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0       5587
1.0        153
2.0         80
3.0         59
4.0         45
          ... 
3627.0       1
2074.0       1
871.0        1
742.0        1
1872.0       1
Name: ShoppingMall, Length: 1115, dtype: int64
------- Null Check: -------
208
------- Describe -------
count     8485.000000
mean       173.729169
std        604.696458
min          0.000000
25%          0.000000
50%          0.000000
75%         27.000000
max      23492.000000
Name: ShoppingMall, dtype: float64
Out[29]:
0          0.0
1         25.0
2          0.0
3        371.0
4        151.0
         ...  
8688       0.0
8689       0.0
8690    1872.0
8691       0.0
8692       0.0
Name: ShoppingMall, Length: 8693, dtype: float64

Spa¶

In [30]:
# Spa: text summary plus a box plot.
explore_data(column="Spa", chart=["box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Spa
Non-Null Count  Dtype  
--------------  -----  
8510 non-null   float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0       5324
1.0        146
2.0        105
5.0         53
3.0         53
          ... 
273.0        1
2581.0       1
2948.0       1
3778.0       1
1643.0       1
Name: Spa, Length: 1327, dtype: int64
------- Null Check: -------
183
------- Describe -------
count     8510.000000
mean       311.138778
std       1136.705535
min          0.000000
25%          0.000000
50%          0.000000
75%         59.000000
max      22408.000000
Name: Spa, dtype: float64
Out[30]:
0          0.0
1        549.0
2       6715.0
3       3329.0
4        565.0
         ...  
8688    1643.0
8689       0.0
8690       1.0
8691     353.0
8692       0.0
Name: Spa, Length: 8693, dtype: float64

VRDeck¶

In [31]:
# VRDeck: text summary plus a histogram and a box plot.
explore_data(column="VRDeck", chart=["box", "hist"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: VRDeck
Non-Null Count  Dtype  
--------------  -----  
8505 non-null   float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0       5495
1.0        139
2.0         70
3.0         56
5.0         51
          ... 
408.0        1
876.0        1
2891.0       1
2102.0       1
3235.0       1
Name: VRDeck, Length: 1306, dtype: int64
------- Null Check: -------
188
------- Describe -------
count     8505.000000
mean       304.854791
std       1145.717189
min          0.000000
25%          0.000000
50%          0.000000
75%         46.000000
max      24133.000000
Name: VRDeck, dtype: float64
------- Hist Chart VRDeck : -------
Out[31]:
0          0.0
1         44.0
2         49.0
3        193.0
4          2.0
         ...  
8688      74.0
8689       0.0
8690       0.0
8691    3235.0
8692      12.0
Name: VRDeck, Length: 8693, dtype: float64

Name¶

In [32]:
# Name: text summary only.
explore_data(column="Name")
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Name
Non-Null Count  Dtype 
--------------  ----- 
8493 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
Gollux Reedall        2
Elaney Webstephrey    2
Grake Porki           2
Sus Coolez            2
Apix Wala             2
                     ..
Jamela Griffy         1
Hardy Griffy          1
Salley Mckinn         1
Mall Frasp            1
Propsh Hontichre      1
Name: Name, Length: 8473, dtype: int64
------- Null Check: -------
200
------- Describe -------
count               8493
unique              8473
top       Gollux Reedall
freq                   2
Name: Name, dtype: object
Out[32]:
0         Maham Ofracculy
1            Juanna Vines
2           Altark Susent
3            Solam Susent
4       Willy Santantines
              ...        
8688    Gravior Noxnuther
8689      Kurta Mondalley
8690         Fayey Connon
8691     Celeon Hontichre
8692     Propsh Hontichre
Name: Name, Length: 8693, dtype: object
In [ ]: